Archivos y Bases de datos


In [1]:
import mysql.connector

La idea de este taller es manipular archivos (leerlos, parsearlos y escribirlos) y hacer lo mismo con bases de datos estructuradas.

Ejercicio 1

Baje el archivo de "All associations with added ontology annotations" del GWAS Catalog.

Describa las columnas del archivo (que información estamos mirando? Para qué sirve? Por qué la hicieron?)


In [2]:
import pandas as pd
df= pd.read_csv('C:/Users/Alex/Documents/eafit/semestres/X semestre/programacion/taller2.tsv', sep = '\t')
df[:1]


C:\Users\Alex\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2717: DtypeWarning: Columns (12,23,27) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[2]:
DATE ADDED TO CATALOG PUBMEDID FIRST AUTHOR DATE JOURNAL LINK STUDY DISEASE/TRAIT INITIAL SAMPLE SIZE REPLICATION SAMPLE SIZE ... CONTEXT INTERGENIC RISK ALLELE FREQUENCY P-VALUE PVALUE_MLOG P-VALUE (TEXT) OR or BETA 95% CI (TEXT) PLATFORM [SNPS PASSING QC] CNV
0 2009-09-28 18403759 Ober C 2008-04-09 N Engl J Med www.ncbi.nlm.nih.gov/pubmed/18403759 Effect of variation in CHI3L1 on serum YKL-40 ... YKL-40 levels 632 Hutterite individuals 443 European ancestry cases, 491 European ance... ... upstream_gene_variant 0.0 0.29 1e-13 13.0 NaN 0.3 [NR] ng/ml decrease Affymetrix [290325] N

1 rows × 34 columns

Qué Entidades (tablas) puede definir?

  1. Enfermedad
  2. Plataforma (tecnologia de secuenciacion)
  3. Loci
  4. Enfermedad-loci
  5. Journal
  6. Estudio
  7. Publicacion

Cree la base de datos (copie el código SQL que se usó)


In [5]:
CREATE TABLE enfermedad
(
id_enfermedad int AUTO_INCREMENT PRIMARY KEY,
nombre varchar(255)
);

create table plataforma
(
id_plataforma int AUTO_INCREMENT primary key, 
nombre varchar(255)
);

CREATE TABLE loci
(
id_loci int AUTO_INCREMENT PRIMARY KEY,
id_plataforma int,
foreign key (id_plataforma) references plataforma(id_plataforma),
region varchar(255),
chrom varchar(255), 
pos varchar(255),
genes_reportados varchar(255),
gen_mapped varchar(255),
gen_upstream varchar(255),
gen_downstream varchar(255),
SNP_GENE_IDS varchar(255),
UPSTREAM_GENE_DISTANCE varchar(255),
DOWNSTREAM_GENE_DISTANCE varchar(255),
STRONGEST_SP_RISK varchar(255),
SNPS varchar(255),
MERGED varchar(255),
SNP_ID_CURRENT varchar(255),
CONTEXTO varchar(500),
risk_allele varchar(255)
#PVAl int,
#PVALUE_MLOG int,
#PVALUE_txt varchar(255),
#BETA int,  
#novCI varchar(255)
);

CREATE TABLE enfermedad_loci
(
id_loci int ,
id_enfermedad int,
PRIMARY KEY (id_loci, id_enfermedad),
foreign key (id_loci) references loci(id_loci),
foreign key (id_enfermedad) references enfermedad(id_enfermedad)
);

CREATE TABLE journal
(
id_journal int AUTO_INCREMENT PRIMARY KEY,
nombre varchar(255)
);

CREATE TABLE estudio
(
id_estudio int AUTO_INCREMENT PRIMARY KEY,
id_journal int,
id_enfermedad int,
foreign key (id_journal) references journal(id_journal),
foreign key (id_enfermedad) references enfermedad(id_enfermedad),
nombre varchar(255),
tamano_muestra varchar(500),
replicas varchar(500)
);

create table publicacion
(
id_publicacion int AUTO_INCREMENT PRIMARY KEY,
id_journal int,
id_estudio int,
foreign key (id_journal) references journal(id_journal),
foreign key (id_estudio) references estudio(id_estudio),
id_pubmed int,
autor varchar (255),
fecha_pub varchar (20),
link varchar (500)
);


  File "<ipython-input-5-3975823bd552>", line 1
    CREATE TABLE enfermedad
               ^
SyntaxError: invalid syntax

Ejercicio 2

Lea el archivo y guarde la infomación en la base de datos en las tablas que se definidieron en el Ejercicio 1.


In [6]:
df.head(1)


Out[6]:
DATE ADDED TO CATALOG PUBMEDID FIRST AUTHOR DATE JOURNAL LINK STUDY DISEASE/TRAIT INITIAL SAMPLE SIZE REPLICATION SAMPLE SIZE ... CONTEXT INTERGENIC RISK ALLELE FREQUENCY P-VALUE PVALUE_MLOG P-VALUE (TEXT) OR or BETA 95% CI (TEXT) PLATFORM [SNPS PASSING QC] CNV
0 2009-09-28 18403759 Ober C 2008-04-09 N Engl J Med www.ncbi.nlm.nih.gov/pubmed/18403759 Effect of variation in CHI3L1 on serum YKL-40 ... YKL-40 levels 632 Hutterite individuals 443 European ancestry cases, 491 European ance... ... upstream_gene_variant 0.0 0.29 1e-13 13.0 NaN 0.3 [NR] ng/ml decrease Affymetrix [290325] N

1 rows × 34 columns


In [7]:
hostname = '127.0.0.1'
username = 'alexacl95'
password = 'SUSAna05'
database = 'programacion'

def doQuery( conn ) :
    cur = conn.cursor()

    cur.execute( "select * from enfermedad_loci" )

    for id_nombre, nombre_enf in cur.fetchall() :
        print (id_nombre, nombre_enf)

myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )
doQuery( myConnection )
myConnection.close()


1 22
2 23
3 24
3 25
3 26
3 27
10 28
11 29
11 30
11 31
11 32
11 33
12 34
12 35
12 36
12 37
12 38
13 39
14 40
15 41
15 42
16 43
16 44
16 45
16 46
16 47
17 48
17 49
17 50
17 51
17 52
17 53
17 54
17 55
17 56
17 57
17 58
18 59
19 60
19 61
20 62
21 63
22 64
23 65
23 66
23 67
23 68
23 69
23 70
23 71
24 72
25 73
26 74
26 75
26 76
26 77
26 78
26 79
27 80
27 81
27 82
27 83
27 84
18 85
18 86
26 87
26 88
26 89
26 90
26 91
18 92
15 93
28 94
29 95
15 96
30 97
31 98
31 99
32 100
33 101
34 102
34 103
34 104
11 105
35 106
35 107
35 108
35 109
35 110
36 111
36 112
36 113
37 114
37 115
38 116
38 117
39 118
40 119
40 120
40 121
41 122
42 123
43 124
43 125
43 126
43 127
44 128
44 129
44 130
45 131
46 132
47 133
47 134
47 135
48 136
48 137
49 138
49 139
50 140
50 141
50 142
51 143
51 144
52 145
52 146
53 147
13 148
13 149
13 150
13 151
13 152
17 153
17 154
17 155
17 156
17 157
54 158
54 159
55 160
55 161
56 162
56 163
57 164
57 165
57 166
58 167
58 168
58 169
58 170
59 171
60 172
60 173
61 174
62 175
62 176
63 177
63 178
64 179
64 180
64 181
65 182
65 183
66 184
66 185
28 186
30 187
67 188
67 189
68 190
68 191
68 192
27 193
69 194
70 195
71 196
72 197
72 198
72 199
72 200
72 201
73 202
74 203
74 204
74 205
75 206
76 207
76 208
75 209
75 210
75 211
77 212
24 213
11 214
11 215
78 216
17 217
17 218
16 219
79 220
79 221
79 222
79 223
79 224
76 225
76 226
76 227
76 228
67 229
67 230
67 231
67 232
67 233
80 234
80 235
30 236
30 237
30 238
30 239
24 240
24 241
24 242
24 243
24 244
12 245
12 246
12 247
12 248
12 249
67 250
67 251
67 252
67 253
67 254
24 255
24 256
24 257
24 258
24 259
13 260
13 261
13 262
13 263
13 264
13 265
13 266
13 267
79 268
81 269
81 270
81 271
82 272
12 273
12 274
67 275
67 276
36 277
17 278
17 279
14 280
67 281
28 282
28 283
28 284
28 285
28 286
12 287
10 288
10 289
83 290
84 291
84 292
84 293
85 294
84 295
86 296
78 297
17 298
87 299
42 300
42 301
88 302
32 303
89 304
90 305
90 306
91 307
91 308
92 309
93 310
91 311
91 312
94 313
36 314
95 315
95 316
95 317
95 318
95 319
87 320
87 321
87 322
87 323
87 324
93 325
90 326
96 327
96 328
96 329
97 330
97 331
97 332
97 333
91 334
52 335
98 336
98 337
98 338
98 339
98 340
99 341
52 342
100 343
31 344
101 345
2 346
102 347
90 348
96 349
33 350
103 351
67 352
67 353
67 354
67 355
67 356
104 357
105 358
12 359
81 360
106 361
106 362
106 363
107 364
107 365
107 366
108 367
75 368
75 369
30 370
14 371
14 372
14 373
23 374
23 375
23 376
23 377
23 378
65 379
109 380
110 381
110 382
110 383
110 384
110 385
110 386
111 387
111 388
23 389
23 390
23 391
30 392
107 393
107 394
106 395
67 396
67 397
67 398
67 399
67 400
67 401
67 402
67 403
67 404
67 405
67 406
67 407
67 408
67 409
67 410
67 411
67 412
67 413
67 414
67 415
67 416
67 417
67 418
67 419
67 420
99 421
99 422
98 423
94 424
94 425
94 426
87 427
87 428
87 429
87 430
112 431
12 432
28 433
67 434
17 435
17 436
88 437
12 438
12 439
12 440
12 441
12 442
13 443
24 444
24 445
24 446
67 447
67 448
67 449
67 450
79 451
79 452
67 453
67 454
67 455
67 456
67 457
67 458
67 459
67 460
30 461
30 462
30 463
24 464
24 465
24 466
24 467
24 468
12 469
12 470
16 471
12 472
12 473
24 474
24 475
24 476
76 477
76 478
72 479
67 480
67 481
30 482
30 483
64 484
64 485
64 486
64 487
64 488
64 489
64 490
62 491
62 492
60 493
60 494
60 495
60 496
60 497
60 498
59 499
59 500
59 501
59 502
58 503
58 504
58 505
58 506
58 507
58 508
58 509
58 510
58 511
58 512
57 513
57 514
57 515
57 516
57 517
56 518
13 519
13 520
13 521
13 522
65 523
66 524
66 525
66 526
66 527
66 528
66 529
66 530
66 531
66 532
66 533
66 534
66 535
66 536
66 537
66 538
66 539
51 540
51 541
52 542
52 543
52 544
52 545
43 546
43 547
43 548
43 549
43 550
43 551
43 552
43 553
43 554
43 555
43 556
43 557
42 558
42 559
42 560
42 561
42 562
48 563
48 564
48 565
48 566
48 567
48 568
48 569
48 570
48 571
48 572
48 573
41 574
35 575
35 576
35 577
35 578
35 579
35 580
35 581
35 582
35 583
35 584
35 585
35 586
35 587
35 588
35 589
35 590
91 591
91 592
91 593
91 594
91 595
91 596
91 597
91 598
91 599
91 600
91 601
91 602
91 603
91 604
91 605
91 606
91 607
91 608
91 609
91 610
91 611
91 612
91 613
91 614
91 615
91 616
91 617
91 618
91 619
91 620
91 621
91 622
91 623
91 624
91 625
91 626
91 627
91 628
91 629
91 630
91 631
91 632
91 633
91 634
47 635
47 636
47 637
47 638
47 639
47 640
18 641
18 642
26 643
26 644
26 645
12 646
12 647
12 648
12 649
12 650
12 651
12 652
12 653
12 654
12 655
12 656
95 657
95 658
95 659
79 660
79 661
79 662
79 663
79 664
79 665
79 666
79 667
79 668
79 669
79 670
79 671
79 672
79 673
79 674
79 675
79 676
79 677
79 678
79 679
79 680
28 681
112 682
112 683
112 684
112 685
112 686
112 687
24 688
24 689
24 690
24 691
113 692
90 693
90 694
90 695
96 696
96 697
96 698
36 699
2 700
2 701
11 702
11 703
11 704
2 705
16 706
16 707
16 708
14 709
17 710
17 711
17 712
17 713
17 714
17 715
17 716
17 717
17 718
17 719
18 720
18 721
23 722
23 723
114 724
115 725
67 726
67 727
30 728
115 729
26 730
26 731
26 732
26 733
26 734
23 735
23 736
23 737
23 738
23 739
23 740
23 741
25 742
25 743
25 744
25 745
25 746
26 747
26 748
26 749
26 750
33 751
33 752
31 753
31 754
116 755
32 756
32 757
117 758
50 759
50 760
50 761
50 762
50 763
50 764
50 765
49 766
49 767
49 768
12 769
12 770
103 771
103 772
97 773
97 774
67 775
18 776
18 777
18 778
18 779
18 780
26 781
26 782
26 783
26 784
118 785
107 786
107 787
107 788
28 789
28 790
42 791
42 792
67 793
67 794
76 795
76 796
76 797
3 798
119 799
119 800
30 801
30 802
30 803
30 804
30 805
30 806
30 807
30 808
30 809
14 810
14 811
14 812
14 813
14 814
14 815
120 816
121 817
122 818
13 819
71 820
71 821
71 822
122 823
122 824
122 825
122 826
122 827
122 828
24 829
24 830
24 831
24 832
24 833
123 834
123 835
124 836
124 837
115 838
115 839
15 840
15 841
15 842
125 843
125 844
125 845
126 846
126 847
3 848
3 849
3 850
127 851
24 852
24 853
24 854
24 855
24 856
24 857
128 858
128 859
128 860
128 861
128 862
128 863
128 864
128 865
128 866
128 867
128 868
11 869
11 870
11 871
11 872
24 873
24 874
24 875
24 876
24 877
24 878
24 879
24 880
129 881
129 882
129 883
129 884
27 885
72 886
72 887
72 888
72 889
72 890
72 891
72 892
72 893
72 894
130 895
130 896
130 897
130 898
130 899
130 900
131 901
72 902
28 903
18 904
18 905
132 906
132 907
132 908
132 909
132 910
115 911
115 912
3 913
3 914
72 915
133 916
52 917
134 918
134 919
134 920
134 921
134 922
134 923
134 924
134 925
134 926
134 927
134 928
134 929
134 930
134 931
134 932
134 933
134 934
134 935
134 936
134 937
134 938
134 939
134 940
134 941
134 942
134 943
135 944
135 945
135 946
135 947
135 948
135 949
136 950
136 951
136 952
136 953
27 954
137 955
137 956
137 957
137 958
137 959
137 960
137 961
137 962
137 963
137 964
137 965
137 966
137 967
137 968
137 969
137 970
36 971
36 972
36 973
36 974
36 975
36 976
36 977
36 978
36 979
36 980
36 981
138 982
138 983
138 984
138 985
138 986
138 987
138 988
138 989
138 990
138 991
138 992
138 993
138 994
138 995
138 996
26 997
26 998
26 999
26 1000
26 1001
136 1002
25 1003
25 1004
18 1005
18 1006
12 1007
12 1008
26 1009
26 1010
26 1011
18 1012
18 1013
18 1014
18 1015
18 1016
18 1017
18 1018
18 1019
18 1020
18 1021
25 1022
25 1023
25 1024
25 1025
25 1026
25 1027
25 1028
25 1029
136 1030
136 1031
136 1032
136 1033
136 1034
136 1035
136 1036
136 1037
136 1038
136 1039
25 1040
25 1041
25 1042
25 1043
18 1044
18 1045
18 1046
18 1047
18 1048
18 1049
139 1050
139 1051
139 1052
139 1053
139 1054
139 1055
139 1056
139 1057
139 1058
139 1059
139 1060
25 1061
18 1062
18 1063
18 1064
18 1065
18 1066
25 1067
25 1068
25 1069
25 1070
25 1071
25 1072
26 1073
26 1074
26 1075
18 1076
18 1077
18 1078
18 1079
18 1080
18 1081
18 1082
25 1083
25 1084
25 1085
25 1086
25 1087
25 1088
25 1089
25 1090
26 1091
26 1092
26 1093
26 1094
26 1095
26 1096
26 1097
26 1098
26 1099
140 1100
140 1101
140 1102
140 1103
140 1104
140 1105
140 1106
140 1107
141 1108
141 1109
141 1110
141 1111
141 1112
141 1113
141 1114
141 1115
141 1116
141 1117
141 1118
142 1119
142 1120
142 1121
142 1122
142 1123
143 1124
143 1125
143 1126
143 1127
144 1128
144 1129
144 1130
145 1131
145 1132
145 1133
12 1134
12 1135
12 1136
12 1137
144 1138
144 1139
144 1140
12 1141
12 1142
47 1143
135 1144
72 1145
144 1146
144 1147
144 1148
144 1149
144 1150
144 1151
144 1152
144 1153
144 1154
144 1155
144 1156
144 1157
144 1158
144 1159
144 1160
144 1161
144 1162
144 1163
97 1164
97 1165
97 1166
97 1167
97 1168
97 1169
97 1170
95 1171
95 1172
115 1173
95 1174
95 1175
95 1176
95 1177
95 1178
97 1179
105 1180
146 1181
144 1182
144 1183
144 1184
115 1185
124 1186
124 1187
124 1188
124 1189
124 1190
124 1191
124 1192
124 1193
146 1194
146 1195
147 1196
147 1197
115 1198
115 1199
115 1200
148 1201
2 1202
2 1203
149 1204
150 1205
150 1206
151 1207
151 1208
2 1209
2 1210
2 1211
2 1212
2 1213
2 1214
2 1215
2 1216
2 1217
52 1218
52 1219
52 1220
124 1221
124 1222
124 1223
124 1224
152 1225
152 1226
152 1227
152 1228
152 1229
152 1230
152 1231
152 1232
152 1233
152 1234
152 1235
152 1236
153 1237
154 1238
154 1239
154 1240
17 1241
155 1242
156 1243
156 1244
156 1245
156 1246
156 1247
156 1248
156 1249
156 1250
156 1251
156 1252
156 1253
156 1254
156 1255
156 1256
156 1257
156 1258
156 1259
157 1260
157 1261
158 1262
158 1263
159 1264
159 1265
159 1266
159 1267
159 1268
159 1269
159 1270
76 1271
76 1272
160 1273
160 1274
160 1275
160 1276
160 1277
160 1278
160 1279
160 1280
160 1281
161 1282
145 1283
162 1284
155 1285
155 1286
161 1287
13 1288
42 1289
42 1290
163 1291
164 1292
164 1293
164 1294
165 1295
165 1296
166 1297
167 1298
99 1299
99 1300
168 1301
168 1302
85 1303
85 1304
85 1305
85 1306
85 1307
85 1308
85 1309
85 1310
85 1311
85 1312
85 1313
85 1314
85 1315
164 1316
164 1317
164 1318
164 1319
164 1320
164 1321
164 1322
164 1323
164 1324
164 1325
164 1326
164 1327
164 1328
164 1329
166 1330
99 1331
99 1332
165 1333
165 1334
165 1335
165 1336
165 1337
165 1338
165 1339
165 1340
165 1341
165 1342
85 1343
169 1344
169 1345
169 1346
170 1347
169 1348
169 1349
171 1350
64 1351
76 1352
76 1353
80 1354
85 1355
85 1356
85 1357
85 1358
85 1359
85 1360
85 1361
85 1362
85 1363
85 1364
85 1365
12 1366
13 1367
13 1368
13 1369
13 1370
13 1371
13 1372
12 1373
12 1374
12 1375
172 1376
172 1377
172 1378
172 1379
172 1380
172 1381
172 1382
172 1383
172 1384
172 1385
173 1386
174 1387
175 1388
27 1389
27 1390
27 1391
27 1392
27 1393
27 1394
27 1395
27 1396
27 1397
27 1398
27 1399
27 1400
27 1401
27 1402
27 1403
27 1404
27 1405
27 1406
27 1407
27 1408
176 1409
25 1410
177 1411
177 1412
27 1413
27 1414
27 1415
27 1416
27 1417
177 1418
177 1419
177 1420
177 1421
12 1422
12 1423
12 1424
12 1425
12 1426
12 1427
12 1428
123 1429
123 1430
123 1431
123 1432
123 1433
123 1434
123 1435
123 1436
123 1437
123 1438
123 1439
123 1440
178 1441
178 1442
178 1443
178 1444
178 1445
178 1446
178 1447
178 1448
178 1449
179 1450
179 1451
179 1452
179 1453
179 1454
179 1455
179 1456
179 1457
177 1458
177 1459
180 1460
180 1461
180 1462
180 1463
180 1464
180 1465
180 1466
180 1467
180 1468
123 1469
123 1470
123 1471
123 1472
123 1473
123 1474
123 1475
123 1476
123 1477
123 1478
123 1479
123 1480
123 1481
123 1482
123 1483
123 1484
123 1485
123 1486
123 1487
123 1488
123 1489
123 1490
123 1491
123 1492
123 1493
123 1494
181 1495
181 1496
181 1497
182 1498
120 1499
123 1500
123 1501
123 1502
123 1503
123 1504
123 1505
123 1506
178 1507
178 1508
178 1509
178 1510
178 1511
179 1512
179 1513
179 1514
179 1515
179 1516
179 1517
179 1518
181 1519
181 1520
181 1521
181 1522
180 1523
180 1524
183 1525
184 1526
184 1527
184 1528
184 1529
184 1530
184 1531
184 1532
184 1533
79 1534
79 1535
79 1536
79 1537
79 1538
79 1539
79 1540
79 1541
79 1542
79 1543
79 1544
79 1545
79 1546
185 1547
185 1548
185 1549
186 1550
186 1551
186 1552
186 1553
186 1554
186 1555
80 1556
80 1557
80 1558
80 1559
185 1560
185 1561
185 1562
185 1563
185 1564
185 1565
185 1566
186 1567
186 1568
186 1569
186 1570
186 1571
186 1572
186 1573
184 1574
187 1575
73 1576
184 1577
184 1578
24 1579
24 1580
24 1581
24 1582
24 1583
24 1584
24 1585
24 1586
24 1587
24 1588
24 1589
24 1590
24 1591
24 1592
24 1593
24 1594
24 1595
24 1596
24 1597
24 1598
24 1599
24 1600
24 1601
24 1602
24 1603
24 1604
24 1605
24 1606
24 1607
24 1608
24 1609
24 1610
24 1611
24 1612
24 1613
24 1614
24 1615
24 1616
188 1617
188 1618
188 1619
188 1620
189 1621
189 1622
189 1623
189 1624
190 1625
190 1626
190 1627
191 1628
191 1629
191 1630
191 1631
191 1632
191 1633
183 1634
192 1635
193 1636
193 1637
193 1638
193 1639
193 1640
193 1641
194 1642
195 1643
195 1644
196 1645
79 1646
30 1647
30 1648
30 1649
197 1650
197 1651
197 1652
197 1653
197 1654
197 1655
198 1656
198 1657
198 1658
198 1659
198 1660
30 1661
30 1662
122 1663
122 1664
122 1665
122 1666
122 1667
122 1668
122 1669
122 1670
122 1671
196 1672
196 1673
196 1674
122 1675
122 1676
12 1677
12 1678
12 1679
12 1680
12 1681
12 1682
12 1683
199 1684
199 1685
199 1686
199 1687
199 1688
199 1689
199 1690
199 1691
199 1692
199 1693
199 1694
199 1695
199 1696
199 1697
199 1698
199 1699
199 1700
199 1701
199 1702
199 1703
12 1704
12 1705
12 1706
12 1707
199 1708
199 1709
199 1710
199 1711
199 1712
199 1713
199 1714
199 1715
199 1716
12 1717
12 1718
12 1719
12 1720
12 1721

In [27]:
def get_diseaseId(disease_name):
    cur = myConnection.cursor()

    cur.execute( """select * from enfermedad where nombre = "%s" """ % (disease_name) )
    
    id_enf = None
    for id_, dissease in cur.fetchall() :
        id_enf = id_
        
    if not id_enf:
        #print("""insert into enfermedad values (NULL, "%s" )""" % (disease_name))
        cur.execute("""insert into enfermedad values (NULL, "%s" )""" % (disease_name))
        cur.execute("SELECT LAST_INSERT_ID()")
        
        id_enf = cur.fetchall()[0][0]
        
        myConnection.commit()
        
    return id_enf

In [28]:
def get_platId(plat_name):
    cur = myConnection.cursor()

    cur.execute( """select * from plataforma where nombre = "%s" """ % (plat_name) )
    
    id_plat = None
    for id_, nombre_plat in cur.fetchall() :
        id_plat = id_
        
    if not id_plat:
        #print("""insert into plataforma values (NULL, "%s" )""" % (plat_name))
        cur.execute("""insert into plataforma values (NULL, "%s" )""" % (plat_name))
        cur.execute("SELECT LAST_INSERT_ID()")
        
                
        id_plat = cur.fetchall()[0][0]
        
        myConnection.commit()
        
    return id_plat

In [29]:
def get_jourId(jour_name):
    cur = myConnection.cursor()

    cur.execute( """select * from journal where nombre = "%s" """ % (jour_name) )
    
    id_jour = None
    for id_, nombre_jour in cur.fetchall() :
        id_jour = id_
        
    if not id_jour:
        #print("""insert into journal values (NULL, "%s" )""" % (jour_name))
        cur.execute("""insert into journal values (NULL, "%s" )""" % (jour_name))
        cur.execute("SELECT LAST_INSERT_ID()")
        
                
        id_jour = cur.fetchall()[0][0]
        
        myConnection.commit()
        
    return id_jour

In [30]:
def get_estId(id_journal, id_enfermedad, est_name, tam_mues, replica):
    cur = myConnection.cursor()

    cur.execute( """select * from estudio where nombre = "%s" """ % (est_name))
    
    id_est = None
    for id_, id_journal, id_enfermedad, nombre_est, tam_mues, replica in cur.fetchall() :
        id_est = id_
        
    if not id_est:
        #print("""insert into estudio values (NULL, "%s", "%s", "%s", "%s", "%s")""" % (id_journal, id_enfermedad,  est_name, tam_mues, replica))
        cur.execute("""insert into estudio values (NULL, "%s", "%s", "%s", "%s", "%s")""" % ( id_journal,id_enfermedad,est_name, tam_mues, replica))
        cur.execute("SELECT LAST_INSERT_ID()")
    
               
        id_est = cur.fetchall()[0][0]
    
        myConnection.commit()
        
    return id_est

In [31]:
def get_locId(id_plataforma, region, chrom, pos, genes_reportados, gen_mapped, gen_upstream,gen_downstream, SNP_GENE_ID, UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTANCE, STRONGEST_SP_RISK, SNPS, MERGED, SNP_ID_CURRENT, CONTEXTO, risk_allele):
    cur = myConnection.cursor()

    cur.execute( """select * from loci where chrom = "%s" and pos="%s" """ % (chrom,pos))
    
    id_loc = None
    for id_,id_plataforma, region, chrom, pos, genes_reportados, gen_mapped, gen_upstream, gen_downstream, SNP_GENE_ID, UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTANCE, STRONGEST_SP_RISK, SNPS, MERGED, SNP_ID_CURRENT, CONTEXTO, risk_allele in cur.fetchall() :
        id_est = id_
        
    if not id_loc:
        #print("""insert into loci values (NULL, "%s","%s", "%s", "%s", "%s",
        #"%s","%s", "%s", "%s", "%s",
        #"%s","%s", "%s", "%s", "%s",
        #"%s","%s")""" % (id_plataforma, region,chrom,pos,genes_reportados,
        #                       gen_mapped, gen_upstream,gen_downstream, SNP_GENE_ID, 
        #                        UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTANCE, 
        #                        STRONGEST_SP_RISK, SNPS, MERGED, SNP_ID_CURRENT, 
        #                        CONTEXTO, risk_allele))
        
        cur.execute("""insert into loci values (NULL, "%s","%s", "%s", "%s", "%s",
        "%s","%s", "%s", "%s", "%s",
        "%s","%s", "%s", "%s", "%s",
        "%s","%s")""" % (id_plataforma, region,chrom,pos,genes_reportados,
                                gen_mapped, gen_upstream,gen_downstream, SNP_GENE_ID, 
                                UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTANCE, 
                                STRONGEST_SP_RISK, SNPS, MERGED, SNP_ID_CURRENT, 
                                CONTEXTO, risk_allele))
        cur.execute("SELECT LAST_INSERT_ID()")
    
               
        id_loc = cur.fetchall()[0][0]
    
        myConnection.commit()
        
    return id_loc

In [32]:
def get_pubId(id_journal, id_estudio, id_pubmed, autor, fecha_pub, link):
    cur = myConnection.cursor()

    cur.execute( """select * from publicacion where id_pubmed = "%s" """ % (id_pubmed))
    
    id_pub = None
    for id_, id_journal, id_estudio, idpubmed, autor, fecha_pub, link in cur.fetchall() :
        id_pub = id_
        
    if not id_pub:
        #print("""insert into publicacion values (NULL, "%s", "%s", "%s", "%s", "%s", "%s")""" % (id_journal, id_estudio, id_pubmed, autor, fecha_pub, link))
        cur.execute("""insert into publicacion values (NULL, "%s", "%s", "%s", "%s", "%s", "%s")""" % (id_journal, id_estudio, id_pubmed, autor, fecha_pub, link))
        cur.execute("SELECT LAST_INSERT_ID()")
    
               
        id_pub = cur.fetchall()[0][0]
    
        myConnection.commit()
        
    return id_pub

In [33]:
def get_enfloId(loci_id, id_enfermedad,):
    cur=myConnection.cursor()
    
    cur.execute( """select * from enfermedad_loci where id_loci = "%s" and id_enfermedad = "%s" """ % (id_loci, id_enfermedad))
    
    existe = None
    for id_, loci_id, id_enfermedad in cur.fetchall() :
        existe = True
        
    if not existe:
        #print("""insert into enfermedad_loci values ("%s","%s")""" % (loci_id, id_enfermedad))
        cur.execute("""insert into enfermedad_loci values ("%s", "%s")""" % (loci_id, id_enfermedad))
        myConnection.commit()

In [35]:
hostname = '127.0.0.1'
username = 'alexacl95'
password = 'SUSAna05'
database = 'programacion'

myConnection = mysql.connector.connect(host=hostname, user=username, passwd=password, db=database )
cur=myConnection.cursor()

for index, row in df.iterrows():
    #identidad enfermedad
    dissease_name = row['DISEASE/TRAIT']
    dissease_id = get_diseaseId(dissease_name)
        
    #identidad plataforma
    plat_name = row['PLATFORM [SNPS PASSING QC]']
    plat_id = get_platId(plat_name)
    
    #identidad loci
    id_plataforma = plat_id
    region=row['REGION']
    chrom=row['CHR_ID']
    pos=row['CHR_POS']
    genes_reportados=row['REPORTED GENE(S)']
    gen_mapped=row['MAPPED_GENE']
    gen_upstream=row['UPSTREAM_GENE_ID']
    gen_downstream=row['DOWNSTREAM_GENE_ID']
    SNP_GENE_ID=row['SNP_GENE_IDS']
    UPSTREAM_GENE_DISTANCE=row['UPSTREAM_GENE_DISTANCE']
    DOWNSTREAM_GENE_DISTANCE=row['DOWNSTREAM_GENE_DISTANCE']
    STRONGEST_SP_RISK=row['STRONGEST SNP-RISK ALLELE']
    SNPS=row['SNPS']
    MERGED=row['MERGED']
    SNP_ID_CURRENT=row['SNP_ID_CURRENT']
    CONTEXTO=row['CONTEXT']
    risk_allele=row['RISK ALLELE FREQUENCY']
    #PVAL=row['P-VALUE']
    #Pvalue_MLOG=row['PVALUE_MLOG']
    #PVALUE_txt=row['P-VALUE (TEXT)']
    #BETA=row['OR or BETA']
    #novCI= row['95% CI (TEXT)']
    id_loci= get_locId(id_plataforma, region,
                       chrom,pos,genes_reportados, 
                       gen_mapped, gen_upstream,gen_downstream,
                       SNP_GENE_ID, UPSTREAM_GENE_DISTANCE,
                       DOWNSTREAM_GENE_DISTANCE,
                       STRONGEST_SP_RISK, SNPS, MERGED, 
                        SNP_ID_CURRENT, CONTEXTO, risk_allele)
   
    #identidad loci enfermedad
    id_enfermedad=dissease_id 
    loci_id=id_loci 
    get_enfloId(loci_id, id_enfermedad)
    
    #identidad journal
    jour_name = row['JOURNAL']
    jour_id = get_jourId(jour_name)
    
    #identidad estudio
    id_enfermedad=dissease_id
    id_journal=jour_id
    est_name =  row['STUDY']
    tam_mues = row['INITIAL SAMPLE SIZE']
    replica = row['REPLICATION SAMPLE SIZE']
    est_id = get_estId(id_journal, id_enfermedad, est_name, tam_mues, replica)
    
    #identidad publicacion
    id_journal=jour_id
    id_estudio=est_id
    idpubmed= row['PUBMEDID']
    autor= row['FIRST AUTHOR']
    fecha_pub= row['DATE ADDED TO CATALOG']
    link= row['LINK']
    id_publicacion = get_pubId(id_journal, id_estudio, idpubmed, autor, fecha_pub, link)
    
                  
myConnection.close()


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-35-bf2a2d008b78> in <module>()
     50     id_enfermedad=dissease_id
     51     loci_id=id_loci
---> 52     get_enfloId(loci_id, id_enfermedad)
     53 
     54     #identidad journal

<ipython-input-33-ec50b5916f8e> in get_enfloId(loci_id, id_enfermedad)
     11         #print("""insert into enfermedad_loci values ("%s","%s")""" % (loci_id, id_enfermedad))
     12         cur.execute("""insert into enfermedad_loci values ("%s", "%s")""" % (loci_id, id_enfermedad))
---> 13         myConnection.commit()

C:\Users\Alex\Anaconda3\lib\site-packages\mysql\connector\connection.py in commit(self)
   1476     def commit(self):
   1477         """Commit current transaction"""
-> 1478         self._execute_query("COMMIT")
   1479 
   1480     def rollback(self):

C:\Users\Alex\Anaconda3\lib\site-packages\mysql\connector\connection.py in _execute_query(self, query)
   1497             raise errors.InternalError("Unread result found.")
   1498 
-> 1499         self.cmd_query(query)
   1500 
   1501     def _info_query(self, query):

C:\Users\Alex\Anaconda3\lib\site-packages\mysql\connector\connection.py in cmd_query(self, query)
    720         if not isinstance(query, bytes):
    721             query = query.encode('utf-8')
--> 722         result = self._handle_result(self._send_cmd(ServerCmd.QUERY, query))
    723 
    724         if self._have_next_result:

C:\Users\Alex\Anaconda3\lib\site-packages\mysql\connector\connection.py in _send_cmd(self, command, argument, packet_number, packet, expect_response)
    508         if not expect_response:
    509             return None
--> 510         return self._socket.recv()
    511 
    512     def _send_data(self, data_file, send_empty_packet=False):

C:\Users\Alex\Anaconda3\lib\site-packages\mysql\connector\network.py in recv_plain(self)
    224             packet_len = 0
    225             while packet_len < 4:
--> 226                 chunk = self.sock.recv(4 - packet_len)
    227                 if not chunk:
    228                     raise errors.InterfaceError(errno=2013)

KeyboardInterrupt: 

Ejercicio 3

Realize de la base de datos una consulta que le responda una pregunta biológica (e.g. qué genes estan relacionados con cuales enfermedades)


In [70]:
#¿Cual es el numero de locus afectados para n enfermedades? con ejemplo para 4 enfermedades

hostname = '127.0.0.1'
username = 'alexacl95'
password = 'SUSAna05'
database = 'programacion'

myConnection = mysql.connector.connect(host=hostname, user=username, passwd=password, db=database )


def enfermedades_info_1(nombres_enfermedades):
    cur=myConnection.cursor()
    cromosoma=set()
    regionafec= set()
    posi=set()
    N=len(nombres_enfermedades)
    lol = []
    num=0
    count=0
        
    for i in range(len(nombres_enfermedades)):
        cur.execute("""select * FROM enfermedad where nombre= "%s" """ % nombres_enfermedades[i])
        for id_, nombre_enf in cur.fetchall() :
            print ("***Enfermedad escogida: " + nombre_enf)
            print ("- Codigo enfermedad: " )
            print (id_)
    
        num=0
        cur.execute("""select * FROM enfermedad_loci where id_enfermedad= "%s" """  % id_)
        for id_2, cod_enf in cur.fetchall() :
            num=num + 1
            
        
                   
                    
                                
          
        print("- numero de locus afectados:")
        print (num)
        print()
        print()
   
    
enfermedades=("Atopic dermatitis",
              "Allergic rhinitis in asthma",
              "Asthma", "Allergic rhinitis")
enfermedades_info_1(enfermedades)
myConnection.close()


***Enfermedad escogida: Atopic dermatitis
- Codigo enfermedad: 
173
- numero de locus afectados:
4


***Enfermedad escogida: Allergic rhinitis in asthma
- Codigo enfermedad: 
719
- numero de locus afectados:
1


***Enfermedad escogida: Asthma
- Codigo enfermedad: 
73
- numero de locus afectados:
82


***Enfermedad escogida: Allergic rhinitis
- Codigo enfermedad: 
290
- numero de locus afectados:
19


Ejercicio 4

Guarde el resultado de la consulta anterior en un archivo csv


In [74]:
hostname = '127.0.0.1'
username = 'alexacl95'
password = 'SUSAna05'
database = 'programacion'

myConnection = mysql.connector.connect(host=hostname, user=username, passwd=password, db=database )

def enfermedades_info_2(nombres_enfermedades):
    cur=myConnection.cursor()
    cromosoma=set()
    N= len(nombres_enfermedades)+1
    lol = [[] for _ in range(N)]
    lol[0].append("enfermedades")
    lol[0].append("Codigo enfermedad") 
    lol[0].append("numero de locus afectados") 
    for i in range(len(nombres_enfermedades)):
        cur.execute("""select * FROM enfermedad where nombre= "%s" """ % nombres_enfermedades[i])
        for id_, nombre_enf in cur.fetchall() :
                lol[i+1].append(nombre_enf)
                lol[i+1].append(id_)
        
        num=0
        cur.execute("""select * FROM enfermedad_loci where id_enfermedad= "%s" """  % id_)
        for id_2, cod_enf in cur.fetchall() :
            num=num + 1
                
              
        lol[i+1].append(num)
        
       
       
        import csv
        with open('enfermedades_c.csv', 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(lol)
                  
       
    return lol

enfermedades=("Asthma", "Allergic rhinitis", 
              "Allergic rhinitis in asthma",
             "Atopic dermatitis")
enfermedades_info_2(enfermedades)


Out[74]:
[['enfermedades', 'Codigo enfermedad', 'numero de locus afectados'],
 ['Asthma', 73, 82],
 ['Allergic rhinitis', 290, 19],
 ['Allergic rhinitis in asthma', 719, 1],
 ['Atopic dermatitis', 173, 4]]